{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "zgD8f9t-g6T5"
      },
      "source": [
        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/your-repo/your-notebook.ipynb)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hTb9WLITg6T6"
      },
      "source": [
        "# Getting started with ExtractThinker\n",
        "\n",
        "This notebook walks through a minimal ExtractThinker example that uses PyPDF as the document loader and GPT-4o-mini as the LLM."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "22V5zm43g6T6"
      },
      "source": [
        "## Setup\n",
        "\n",
        "First, let's install the required libraries:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "id": "mXyJWdrbg6T7"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Requirement already satisfied: extract-thinker in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (0.0.16)\n",
            "Requirement already satisfied: azure-ai-formrecognizer<4.0.0,>=3.3.3 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (3.3.3)\n",
            "Requirement already satisfied: boto3<2.0.0,>=1.34.161 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (1.34.161)\n",
            "Requirement already satisfied: cachetools<6.0.0,>=5.3.3 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (5.3.3)\n",
            "Requirement already satisfied: google-cloud-documentai<3.0.0,>=2.29.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (2.29.1)\n",
            "Requirement already satisfied: instructor<2.0.0,>=1.2.2 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (1.3.3)\n",
            "Requirement already satisfied: litellm<2.0.0,>=1.35.20 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (1.40.8)\n",
            "Requirement already satisfied: openpyxl<4.0.0,>=3.1.2 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (3.1.4)\n",
            "Requirement already satisfied: pdfplumber<0.12.0,>=0.11.4 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (0.11.4)\n",
            "Requirement already satisfied: pillow<11.0.0,>=10.3.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (10.3.0)\n",
            "Requirement already satisfied: pydantic<3.0.0,>=2.7.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (2.7.4)\n",
            "Requirement already satisfied: pypdf2<4.0.0,>=3.0.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (3.0.1)\n",
            "Requirement already satisfied: pypdfium2<5.0.0,>=4.29.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (4.30.0)\n",
            "Requirement already satisfied: pytesseract<0.4.0,>=0.3.10 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (0.3.10)\n",
            "Requirement already satisfied: python-docx<2.0.0,>=1.1.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (1.1.2)\n",
            "Requirement already satisfied: python-dotenv<2.0.0,>=1.0.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (1.0.1)\n",
            "Requirement already satisfied: pyyaml<7.0.0,>=6.0.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (6.0.1)\n",
            "Requirement already satisfied: tiktoken<0.7.0,>=0.6.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (0.6.0)\n",
            "Requirement already satisfied: xlrd<3.0.0,>=2.0.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from extract-thinker) (2.0.1)\n",
            "Requirement already satisfied: azure-core>=1.23.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from azure-ai-formrecognizer<4.0.0,>=3.3.3->extract-thinker) (1.30.2)\n",
            "Requirement already satisfied: msrest>=0.6.21 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from azure-ai-formrecognizer<4.0.0,>=3.3.3->extract-thinker) (0.7.1)\n",
            "Requirement already satisfied: azure-common>=1.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from azure-ai-formrecognizer<4.0.0,>=3.3.3->extract-thinker) (1.1.28)\n",
            "Requirement already satisfied: typing-extensions>=4.0.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from azure-ai-formrecognizer<4.0.0,>=3.3.3->extract-thinker) (4.12.2)\n",
            "Requirement already satisfied: botocore<1.35.0,>=1.34.161 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from boto3<2.0.0,>=1.34.161->extract-thinker) (1.34.161)\n",
            "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from boto3<2.0.0,>=1.34.161->extract-thinker) (1.0.1)\n",
            "Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from boto3<2.0.0,>=1.34.161->extract-thinker) (0.10.2)\n",
            "Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1->google-cloud-documentai<3.0.0,>=2.29.1->extract-thinker) (2.19.1)\n",
            "Requirement already satisfied: google-auth!=2.24.0,!=2.25.0,<3.0.0dev,>=2.14.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from google-cloud-documentai<3.0.0,>=2.29.1->extract-thinker) (2.30.0)\n",
            "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.3 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from google-cloud-documentai<3.0.0,>=2.29.1->extract-thinker) (1.24.0)\n",
            "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from google-cloud-documentai<3.0.0,>=2.29.1->extract-thinker) (4.25.3)\n",
            "Requirement already satisfied: aiohttp<4.0.0,>=3.9.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from instructor<2.0.0,>=1.2.2->extract-thinker) (3.9.5)\n",
            "Requirement already satisfied: docstring-parser<0.17,>=0.16 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from instructor<2.0.0,>=1.2.2->extract-thinker) (0.16)\n",
            "Requirement already satisfied: jiter<0.5.0,>=0.4.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from instructor<2.0.0,>=1.2.2->extract-thinker) (0.4.2)\n",
            "Requirement already satisfied: openai<2.0.0,>=1.1.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from instructor<2.0.0,>=1.2.2->extract-thinker) (1.34.0)\n",
            "Requirement already satisfied: pydantic-core<3.0.0,>=2.18.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from instructor<2.0.0,>=1.2.2->extract-thinker) (2.18.4)\n",
            "Requirement already satisfied: rich<14.0.0,>=13.7.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from instructor<2.0.0,>=1.2.2->extract-thinker) (13.7.1)\n",
            "Requirement already satisfied: tenacity<9.0.0,>=8.2.3 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from instructor<2.0.0,>=1.2.2->extract-thinker) (8.3.0)\n",
            "Requirement already satisfied: typer<1.0.0,>=0.9.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from instructor<2.0.0,>=1.2.2->extract-thinker) (0.12.3)\n",
            "Requirement already satisfied: click in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from litellm<2.0.0,>=1.35.20->extract-thinker) (8.1.7)\n",
            "Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from litellm<2.0.0,>=1.35.20->extract-thinker) (7.1.0)\n",
            "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from litellm<2.0.0,>=1.35.20->extract-thinker) (3.1.4)\n",
            "Requirement already satisfied: requests<3.0.0,>=2.31.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from litellm<2.0.0,>=1.35.20->extract-thinker) (2.32.3)\n",
            "Requirement already satisfied: tokenizers in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from litellm<2.0.0,>=1.35.20->extract-thinker) (0.19.1)\n",
            "Requirement already satisfied: et-xmlfile in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from openpyxl<4.0.0,>=3.1.2->extract-thinker) (1.1.0)\n",
            "Requirement already satisfied: pdfminer.six==20231228 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from pdfplumber<0.12.0,>=0.11.4->extract-thinker) (20231228)\n",
            "Requirement already satisfied: charset-normalizer>=2.0.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from pdfminer.six==20231228->pdfplumber<0.12.0,>=0.11.4->extract-thinker) (3.3.2)\n",
            "Requirement already satisfied: cryptography>=36.0.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from pdfminer.six==20231228->pdfplumber<0.12.0,>=0.11.4->extract-thinker) (43.0.3)\n",
            "Requirement already satisfied: annotated-types>=0.4.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.7.0->extract-thinker) (0.7.0)\n",
            "Requirement already satisfied: packaging>=21.3 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from pytesseract<0.4.0,>=0.3.10->extract-thinker) (24.1)\n",
            "Requirement already satisfied: lxml>=3.1.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from python-docx<2.0.0,>=1.1.0->extract-thinker) (5.2.2)\n",
            "Requirement already satisfied: regex>=2022.1.18 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from tiktoken<0.7.0,>=0.6.0->extract-thinker) (2024.5.15)\n",
            "Requirement already satisfied: aiosignal>=1.1.2 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.9.1->instructor<2.0.0,>=1.2.2->extract-thinker) (1.3.1)\n",
            "Requirement already satisfied: attrs>=17.3.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.9.1->instructor<2.0.0,>=1.2.2->extract-thinker) (23.2.0)\n",
            "Requirement already satisfied: frozenlist>=1.1.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.9.1->instructor<2.0.0,>=1.2.2->extract-thinker) (1.4.1)\n",
            "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.9.1->instructor<2.0.0,>=1.2.2->extract-thinker) (6.0.5)\n",
            "Requirement already satisfied: yarl<2.0,>=1.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.9.1->instructor<2.0.0,>=1.2.2->extract-thinker) (1.9.4)\n",
            "Requirement already satisfied: six>=1.11.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from azure-core>=1.23.0->azure-ai-formrecognizer<4.0.0,>=3.3.3->extract-thinker) (1.16.0)\n",
            "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from botocore<1.35.0,>=1.34.161->boto3<2.0.0,>=1.34.161->extract-thinker) (2.9.0.post0)\n",
            "Requirement already satisfied: urllib3!=2.2.0,<3,>=1.25.4 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from botocore<1.35.0,>=1.34.161->boto3<2.0.0,>=1.34.161->extract-thinker) (2.2.1)\n",
            "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1->google-cloud-documentai<3.0.0,>=2.29.1->extract-thinker) (1.63.2)\n",
            "Requirement already satisfied: grpcio<2.0dev,>=1.33.2 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1->google-cloud-documentai<3.0.0,>=2.29.1->extract-thinker) (1.64.1)\n",
            "Requirement already satisfied: grpcio-status<2.0.dev0,>=1.33.2 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1->google-cloud-documentai<3.0.0,>=2.29.1->extract-thinker) (1.62.2)\n",
            "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from google-auth!=2.24.0,!=2.25.0,<3.0.0dev,>=2.14.1->google-cloud-documentai<3.0.0,>=2.29.1->extract-thinker) (0.4.0)\n",
            "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from google-auth!=2.24.0,!=2.25.0,<3.0.0dev,>=2.14.1->google-cloud-documentai<3.0.0,>=2.29.1->extract-thinker) (4.9)\n",
            "Requirement already satisfied: zipp>=0.5 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from importlib-metadata>=6.8.0->litellm<2.0.0,>=1.35.20->extract-thinker) (3.19.2)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from jinja2<4.0.0,>=3.1.2->litellm<2.0.0,>=1.35.20->extract-thinker) (2.1.5)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from msrest>=0.6.21->azure-ai-formrecognizer<4.0.0,>=3.3.3->extract-thinker) (2024.6.2)\n",
            "Requirement already satisfied: isodate>=0.6.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from msrest>=0.6.21->azure-ai-formrecognizer<4.0.0,>=3.3.3->extract-thinker) (0.6.1)\n",
            "Requirement already satisfied: requests-oauthlib>=0.5.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from msrest>=0.6.21->azure-ai-formrecognizer<4.0.0,>=3.3.3->extract-thinker) (2.0.0)\n",
            "Requirement already satisfied: anyio<5,>=3.5.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from openai<2.0.0,>=1.1.0->instructor<2.0.0,>=1.2.2->extract-thinker) (4.4.0)\n",
            "Requirement already satisfied: distro<2,>=1.7.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from openai<2.0.0,>=1.1.0->instructor<2.0.0,>=1.2.2->extract-thinker) (1.9.0)\n",
            "Requirement already satisfied: httpx<1,>=0.23.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from openai<2.0.0,>=1.1.0->instructor<2.0.0,>=1.2.2->extract-thinker) (0.27.0)\n",
            "Requirement already satisfied: sniffio in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from openai<2.0.0,>=1.1.0->instructor<2.0.0,>=1.2.2->extract-thinker) (1.3.1)\n",
            "Requirement already satisfied: tqdm>4 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from openai<2.0.0,>=1.1.0->instructor<2.0.0,>=1.2.2->extract-thinker) (4.66.4)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->litellm<2.0.0,>=1.35.20->extract-thinker) (3.7)\n",
            "Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from rich<14.0.0,>=13.7.0->instructor<2.0.0,>=1.2.2->extract-thinker) (3.0.0)\n",
            "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from rich<14.0.0,>=13.7.0->instructor<2.0.0,>=1.2.2->extract-thinker) (2.18.0)\n",
            "Requirement already satisfied: shellingham>=1.3.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from typer<1.0.0,>=0.9.0->instructor<2.0.0,>=1.2.2->extract-thinker) (1.5.4)\n",
            "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from tokenizers->litellm<2.0.0,>=1.35.20->extract-thinker) (0.23.3)\n",
            "Requirement already satisfied: cffi>=1.12 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber<0.12.0,>=0.11.4->extract-thinker) (1.17.1)\n",
            "Requirement already satisfied: httpcore==1.* in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from httpx<1,>=0.23.0->openai<2.0.0,>=1.1.0->instructor<2.0.0,>=1.2.2->extract-thinker) (1.0.5)\n",
            "Requirement already satisfied: h11<0.15,>=0.13 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai<2.0.0,>=1.1.0->instructor<2.0.0,>=1.2.2->extract-thinker) (0.14.0)\n",
            "Requirement already satisfied: filelock in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.35.20->extract-thinker) (3.15.1)\n",
            "Requirement already satisfied: fsspec>=2023.5.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.35.20->extract-thinker) (2024.6.0)\n",
            "Requirement already satisfied: mdurl~=0.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich<14.0.0,>=13.7.0->instructor<2.0.0,>=1.2.2->extract-thinker) (0.1.2)\n",
            "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from pyasn1-modules>=0.2.1->google-auth!=2.24.0,!=2.25.0,<3.0.0dev,>=2.14.1->google-cloud-documentai<3.0.0,>=2.29.1->extract-thinker) (0.6.0)\n",
            "Requirement already satisfied: oauthlib>=3.0.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from requests-oauthlib>=0.5.0->msrest>=0.6.21->azure-ai-formrecognizer<4.0.0,>=3.3.3->extract-thinker) (3.2.2)\n",
            "Requirement already satisfied: pycparser in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber<0.12.0,>=0.11.4->extract-thinker) (2.22)\n",
            "\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
          ]
        }
      ],
      "source": [
        "# Use the %pip magic (not !pip) so the package is installed into the\n",
        "# environment of the running kernel rather than whichever pip is on PATH.\n",
        "%pip install extract-thinker"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n",
        "Next, provide your OpenAI API key (never commit a real key together with the notebook):"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Make the OpenAI key available to the LLM client.\n",
        "# An already-exported OPENAI_API_KEY is kept as-is; otherwise the key is\n",
        "# requested with a hidden prompt so it is never stored in the notebook.\n",
        "import os\n",
        "from getpass import getpass\n",
        "\n",
        "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
        "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Create a DocumentLoader\n",
        "\n",
        "To keep things simple, we'll use the PyPDF loader.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {},
      "outputs": [],
      "source": [
        "from extract_thinker import DocumentLoaderPyPdf\n",
        "\n",
        "document_loader = DocumentLoaderPyPdf()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Create an Extractor\n",
        "\n",
        "The Extractor is the main class that coordinates the extraction process. It needs a DocumentLoader and an LLM."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {},
      "outputs": [],
      "source": [
        "from extract_thinker import Extractor\n",
        "\n",
        "extractor = Extractor()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Attach the document loader and the LLM to the extractor:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {},
      "outputs": [],
      "source": [
        "extractor.load_document_loader(document_loader)\n",
        "\n",
        "extractor.load_llm(\"gpt-4o-mini\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
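        "## Define the contract\n",
        "\n",
        "The contract lists the fields to extract from the document; the extraction itself is run in the last section of this notebook."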
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "metadata": {},
      "outputs": [],
      "source": [
        "from extract_thinker import Contract\n",
        "\n",
        "class InvoiceContract(Contract):\n",
        "    invoice_number: str\n",
        "    invoice_date: str"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Requirement already satisfied: ipywidgets in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (8.1.5)\n",
            "Requirement already satisfied: comm>=0.1.3 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from ipywidgets) (0.2.2)\n",
            "Requirement already satisfied: ipython>=6.1.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from ipywidgets) (8.18.1)\n",
            "Requirement already satisfied: traitlets>=4.3.1 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from ipywidgets) (5.14.3)\n",
            "Requirement already satisfied: widgetsnbextension~=4.0.12 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from ipywidgets) (4.0.13)\n",
            "Requirement already satisfied: jupyterlab-widgets~=3.0.12 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from ipywidgets) (3.0.13)\n",
            "Requirement already satisfied: decorator in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)\n",
            "Requirement already satisfied: jedi>=0.16 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (0.19.1)\n",
            "Requirement already satisfied: matplotlib-inline in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (0.1.7)\n",
            "Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.48)\n",
            "Requirement already satisfied: pygments>=2.4.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (2.18.0)\n",
            "Requirement already satisfied: stack-data in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)\n",
            "Requirement already satisfied: pexpect>4.3 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (4.9.0)\n",
            "Requirement already satisfied: parso<0.9.0,>=0.8.3 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.4)\n",
            "Requirement already satisfied: ptyprocess>=0.5 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)\n",
            "Requirement already satisfied: wcwidth in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.13)\n",
            "Requirement already satisfied: executing>=1.2.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.1.0)\n",
            "Requirement already satisfied: asttokens>=2.1.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.4.1)\n",
            "Requirement already satisfied: pure-eval in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (0.2.3)\n",
            "Requirement already satisfied: six>=1.12.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from asttokens>=2.1.0->stack-data->ipython>=6.1.0->ipywidgets) (1.16.0)\n",
            "\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
            "Requirement already satisfied: IPython in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (8.18.1)\n",
            "Requirement already satisfied: decorator in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from IPython) (5.1.1)\n",
            "Requirement already satisfied: jedi>=0.16 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from IPython) (0.19.1)\n",
            "Requirement already satisfied: matplotlib-inline in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from IPython) (0.1.7)\n",
            "Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from IPython) (3.0.48)\n",
            "Requirement already satisfied: pygments>=2.4.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from IPython) (2.18.0)\n",
            "Requirement already satisfied: stack-data in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from IPython) (0.6.3)\n",
            "Requirement already satisfied: traitlets>=5 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from IPython) (5.14.3)\n",
            "Requirement already satisfied: pexpect>4.3 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from IPython) (4.9.0)\n",
            "Requirement already satisfied: parso<0.9.0,>=0.8.3 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from jedi>=0.16->IPython) (0.8.4)\n",
            "Requirement already satisfied: ptyprocess>=0.5 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from pexpect>4.3->IPython) (0.7.0)\n",
            "Requirement already satisfied: wcwidth in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->IPython) (0.2.13)\n",
            "Requirement already satisfied: executing>=1.2.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from stack-data->IPython) (2.1.0)\n",
            "Requirement already satisfied: asttokens>=2.1.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from stack-data->IPython) (2.4.1)\n",
            "Requirement already satisfied: pure-eval in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from stack-data->IPython) (0.2.3)\n",
            "Requirement already satisfied: six>=1.12.0 in /Users/regina/Desktop/ExtractThinker/.venv/lib/python3.12/site-packages (from asttokens>=2.1.0->stack-data->IPython) (1.16.0)\n",
            "\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
          ]
        }
      ],
      "source": [
        "# Widgets used by the upload cell below. %pip installs into the running\n",
        "# kernel's environment, which !pip does not guarantee.\n",
        "%pip install ipywidgets IPython"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Extract data from an uploaded file according to the contract\n",
        "\n",
        "Upload a PDF with the widget below; the extracted fields are printed underneath it."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {},
      "outputs": [
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "fd90d1f34cc6463f8b86384f18848597",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "FileUpload(value=(), accept='.pdf', description='Upload PDF')"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "6c688cc538dd42febb2be743f0007df5",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Output()"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "import ipywidgets as widgets\n",
        "from IPython.display import display\n",
        "import tempfile\n",
        "\n",
        "# Create file upload widget and output widget\n",
        "file_upload = widgets.FileUpload(accept='.pdf', description='Upload PDF')\n",
        "output = widgets.Output()\n",
        "\n",
        "def on_file_uploaded(change):\n",
        "    \"\"\"Run the extractor on the first file delivered by the upload widget.\n",
        "\n",
        "    Args:\n",
        "        change: Change notification for the widget's ``value`` trait;\n",
        "            ``change['new']`` holds the uploaded files.\n",
        "    \"\"\"\n",
        "    import os  # local import so this cell does not rely on earlier cells\n",
        "\n",
        "    uploads = change['new']\n",
        "    # Only proceed if there's a new file uploaded\n",
        "    if not uploads:\n",
        "        return\n",
        "\n",
        "    if isinstance(uploads, dict):\n",
        "        # ipywidgets 7.x: {filename: {'metadata': {...}, 'content': bytes}}\n",
        "        uploaded_file = next(iter(uploads.values()))\n",
        "        file_name = uploaded_file['metadata']['name']\n",
        "    else:\n",
        "        # ipywidgets 8.x: tuple of {'name': ..., 'content': memoryview, ...}\n",
        "        uploaded_file = uploads[0]\n",
        "        file_name = uploaded_file['name']\n",
        "    content = bytes(uploaded_file['content'])\n",
        "\n",
        "    # Display information within the output widget\n",
        "    with output:\n",
        "        print(f\"File uploaded: {file_name}\")\n",
        "        print(f\"File size: {len(content)} bytes\")\n",
        "\n",
        "        # Close the temporary file before extracting so the bytes are flushed\n",
        "        # to disk and the path can be reopened on every platform.\n",
        "        with tempfile.NamedTemporaryFile(delete=False, suffix=\".pdf\") as temp_file:\n",
        "            temp_file.write(content)\n",
        "            temp_file_path = temp_file.name\n",
        "\n",
        "        try:\n",
        "            result = extractor.extract(temp_file_path, InvoiceContract)\n",
        "            print(result)\n",
        "        finally:\n",
        "            # delete=False leaves the file behind, so remove it explicitly\n",
        "            os.remove(temp_file_path)\n",
        "\n",
        "# Observe changes to the file upload widget's value\n",
        "file_upload.observe(on_file_uploaded, names='value')\n",
        "\n",
        "# Display both widgets\n",
        "display(file_upload, output)"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.7"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
